import os
from datetime import datetime

import numpy as np
import pandas as pd
import statsmodels.api as sm
import tushare as ts

# NOTE(review): a live API credential was hard-coded here. Prefer the
# TUSHARE_TOKEN environment variable; the original literal is kept only as a
# backward-compatible fallback and should be rotated/removed.
token = os.getenv('TUSHARE_TOKEN', 'd942e6ff0e981f76aaa544f84405583e0c2129a8c82213637835a099')
pro = ts.pro_api(token)


def get_adj_data(code, freq='daily', start_date=None, end_date=None):
    """
    Fetch daily or monthly bars for `code` and forward-adjust the OHLC
    prices so the most recent bar keeps its raw price (adjustment factor
    normalized by the latest one).

    :param code: tushare ts_code, e.g. '000001.SZ'
    :param freq: 'daily' or 'monthly'
    :param start_date: 'YYYYMMDD' string passed through to tushare (or None)
    :param end_date: 'YYYYMMDD' string passed through to tushare (or None)
    :return: DataFrame sorted by trade_date with adjusted OHLC plus
             vol/amount/adj_factor/adjs columns; an empty frame when the
             fetch returned fewer than two rows
    :raises ValueError: if freq is neither 'daily' nor 'monthly'
    """
    if freq == 'daily':
        df = pro.daily(ts_code=code, start_date=start_date, end_date=end_date)
    elif freq == 'monthly':
        df = pro.monthly(ts_code=code, start_date=start_date, end_date=end_date)
    else:
        # BUG FIX: an unsupported freq used to fall through and raise a
        # confusing NameError on `df` below; fail fast instead.
        raise ValueError(f"freq must be 'daily' or 'monthly', got {freq!r}")

    if len(df) > 1:
        df = df.sort_values(by=['trade_date']).reset_index(drop=True)
        df = df[['trade_date', 'open', 'high', 'low', 'close', 'vol', 'amount']]
        df_factor = pro.adj_factor(ts_code=code)
        df = pd.merge(df, df_factor, on=['trade_date'])
        # normalize so the latest bar has adjustment factor 1.0
        now_adj = df.loc[len(df) - 1, 'adj_factor']
        df['adjs'] = df['adj_factor'] / now_adj
        # row-wise broadcast of the adjustment factor across the OHLC columns
        # (replaces the original concat-of-4-copies trick)
        df[['open', 'high', 'low', 'close']] = df[['open', 'high', 'low', 'close']].mul(df['adjs'], axis=0)
    else:
        # BUG FIX: the original built this empty frame but never assigned it,
        # so a 0/1-row fetch was returned raw and unadjusted. Column names
        # now also match the success path ('vol', not 'volume').
        df = pd.DataFrame(columns=['trade_date', 'open', 'high', 'low', 'close', 'vol', 'amount'])
    return df


def get_data(code):
    """
    Build a daily fundamentals table for one stock.

    Fetches the income statement, balance sheet, cash-flow statement and
    financial-indicator tables, deduplicates each by announcement date,
    outer-merges them on ann_date, re-indexes onto the SSE calendar with
    forward-fill, then left-joins the daily_basic indicators.

    :param code: tushare ts_code, e.g. '000001.SZ'
    :return: DataFrame keyed by a 'datetime' column of 'YYYYMMDD' strings
    """
    dd1 = pro.income(ts_code=code)
    dd2 = pro.balancesheet(ts_code=code)
    dd3 = pro.cashflow(ts_code=code)
    dd4 = pro.fina_indicator(ts_code=code)

    # presumably dropped to avoid clashing with same-named columns from the
    # other statements in the merges below — TODO confirm
    dd1 = dd1.drop(['ebit', 'ebitda'], axis=1)

    dd1 = dd1.sort_values(by=['ann_date']).reset_index(drop=True)
    dd2 = dd2.sort_values(by=['ann_date']).reset_index(drop=True)
    dd3 = dd3.sort_values(by=['ann_date']).reset_index(drop=True)
    dd4 = dd4.sort_values(by=['ann_date']).reset_index(drop=True)

    # keep only the latest restatement for each actual announcement date
    dd1 = dd1.drop_duplicates(subset=['f_ann_date'], keep='last').reset_index(drop=True)
    dd2 = dd2.drop_duplicates(subset=['f_ann_date'], keep='last').reset_index(drop=True)
    dd3 = dd3.drop_duplicates(subset=['f_ann_date'], keep='last').reset_index(drop=True)
    dd4 = dd4.drop_duplicates(subset=['ann_date'], keep='last').reset_index(drop=True)

    dd1 = dd1.drop(['ts_code', 'f_ann_date', 'end_date', 'report_type', 'comp_type'], axis=1)
    dd2 = dd2.drop(['ts_code', 'f_ann_date', 'end_date', 'report_type', 'comp_type'], axis=1)
    dd3 = dd3.drop(['ts_code', 'f_ann_date', 'end_date', 'report_type', 'comp_type'], axis=1)
    dd4 = dd4.drop(['ts_code', 'end_date'], axis=1)

    dd = pd.merge(dd1, dd2, on=['ann_date'], how='outer')
    dd = pd.merge(dd, dd3, on=['ann_date'], how='outer')
    dd = pd.merge(dd, dd4, on=['ann_date'], how='outer')

    dd.rename(columns={'ann_date': 'datetime'}, inplace=True)
    dd = dd.sort_values(['datetime']).reset_index(drop=True)

    # NOTE(review): trade_cal returns every calendar date with an is_open
    # flag; no is_open filter is applied here, so non-trading days may be
    # present until the daily_basic left-join — confirm this is intended.
    dd_dates = pro.trade_cal(exchange='SSE', start_date=dd.loc[0, 'datetime'],
                             end_date=datetime.strftime(datetime.now(), '%Y%m%d'))
    dd_dates = dd_dates[['cal_date']]
    dd_dates.columns = ['datetime']

    # fillna(method='ffill') is deprecated since pandas 2.1 — use .ffill()
    dd = pd.merge(dd, dd_dates, on=['datetime'], how='right').ffill()

    dd5 = pro.daily_basic(ts_code=code)
    # total_share in daily_basic tracks the standard report date, while the
    # balance-sheet total_share (dd2) updates on announcement dates — keep dd2's
    dd5 = dd5.drop(['ts_code', 'total_share'], axis=1)
    dd5.rename(columns={'trade_date': 'datetime'}, inplace=True)
    dd5 = dd5.sort_values(by=['datetime']).reset_index(drop=True).ffill()

    dd = pd.merge(dd, dd5, on=['datetime'], how='left').ffill()
    return dd


def addfield(dd, code):
    """
    Append extra fina_indicator fields (dtprofit_to_profit, q_profit_to_gr,
    q_profit_yoy, q_gsprofit_margin) to an existing daily table, then
    re-index onto the SSE calendar with forward-fill.

    :param dd: DataFrame with a 'datetime' column of 'YYYYMMDD' strings
    :param code: tushare ts_code
    :return: the merged, calendar-aligned DataFrame
    """
    extra_fields = ['dtprofit_to_profit', 'q_profit_to_gr', 'q_profit_yoy', 'q_gsprofit_margin']
    cols = ['ann_date'] + extra_fields
    dd4 = pro.fina_indicator(ts_code=code, fields=','.join(cols))
    dd4 = dd4.sort_values(by=['ann_date']).reset_index(drop=True)
    # keep only the latest restatement per announcement date
    dd4 = dd4.drop_duplicates(subset=['ann_date'], keep='last').reset_index(drop=True)
    dd4.rename(columns={'ann_date': 'datetime'}, inplace=True)
    dd = pd.merge(dd, dd4, on=['datetime'], how='outer')
    dd = dd.sort_values(['datetime']).reset_index(drop=True)
    dd_dates = pro.trade_cal(exchange='SSE', start_date=dd.loc[0, 'datetime'],
                             end_date=datetime.strftime(datetime.now(), '%Y%m%d'))
    dd_dates = dd_dates[['cal_date']]
    dd_dates.columns = ['datetime']
    # fillna(method='ffill') is deprecated since pandas 2.1 — use .ffill()
    dd = pd.merge(dd, dd_dates, on=['datetime'], how='right').ffill()
    return dd


def SMA(series_1, N, M):
    """
    Recursive (exponential-style) smoothed moving average.

    Each new value is blended into the running average as
    out[i] = (M * x[i] + (N - M) * out[i-1]) / N, seeded with the first
    non-NaN value. Leading NaNs are passed through unchanged.

    :param series_1: a pd.Series (or any sequence of floats)
    :param N: window length, used as the denominator
    :param M: weight assigned to the newest value
    :return: a plain list the same length as the input
    """
    values = list(series_1)

    # locate the first non-NaN entry
    start = 0
    while start < len(values) and np.isnan(values[start]):
        start += 1

    # all-NaN (or empty) input: nothing to smooth
    if start == len(values):
        return [np.nan] * len(values)

    out = [np.nan] * start
    smoothed = values[start]
    out.append(smoothed)
    for v in values[start + 1:]:
        smoothed = (M * v + (N - M) * smoothed) / N
        out.append(smoothed)
    return out


def factors1(code):
    """
    Monthly price-based factors: rolling 60-month CAPM alpha/beta (HAlpha,
    beta) and trailing N-month returns (return_{1,3,6,12}m).

    Built purely from monthly bars of the stock (forward-adjusted) and the
    000001.SH index; no financials, no daily data.

    :param code: tushare ts_code
    :return: DataFrame with a 'month' ('YYYYMM') column for downstream merging
    """
    dd_index = pro.index_monthly(ts_code='000001.SH')
    dd_index.rename(columns={'trade_date': 'datetime'}, inplace=True)
    dd_index = dd_index.sort_values(by=['datetime']).reset_index(drop=True)
    dd_code = get_adj_data(code=code, freq='monthly')
    dd_code.rename(columns={'trade_date': 'datetime'}, inplace=True)
    dd_code['datetime'] = dd_code['datetime'].astype(str)
    dd_index['datetime'] = dd_index['datetime'].astype(str)
    # right-join onto the index calendar; forward-fill gaps in the stock
    # series (suspensions). .ffill() replaces deprecated fillna(method=...).
    dd = pd.merge(dd_code[['datetime', 'close']], dd_index[['datetime', 'close']],
                  on=['datetime'], how='right').ffill().dropna()
    dd.columns = ['datetime', 'code', 'index']
    dd = dd.reset_index(drop=True)
    dd['code_ret'] = dd['code'].pct_change()
    dd['index_ret'] = dd['index'].pct_change()

    if len(dd) > 60:
        p_alpha = [np.nan] * 60
        p_beta = [np.nan] * 60
        for ii in range(60, len(dd)):
            # 60-month rolling CAPM regression of the STOCK's return on the
            # INDEX's return.
            # BUG FIX: the original regressed the index on the stock
            # (x=code_ret, y=index_ret), which yields the wrong alpha/beta.
            y = dd.loc[ii - 60 + 1:ii, 'code_ret']
            x = sm.add_constant(dd.loc[ii - 60 + 1:ii, 'index_ret'])
            result = sm.OLS(y, x).fit()
            p_alpha.append(result.params['const'])
            p_beta.append(result.params['index_ret'])

        dd['HAlpha'] = p_alpha
        dd['beta'] = p_beta
    else:
        # not enough history for a 60-month window
        dd['HAlpha'] = np.nan
        dd['beta'] = np.nan

    # trailing N-month simple returns
    for N in [1, 3, 6, 12]:
        dd[f'return_{N}m'] = (dd['code'] - dd['code'].shift(N)) / (dd['code'].shift(N))

    dd['month'] = dd['datetime'].apply(lambda x: x[:6])
    return dd


def factors2(folder, code):
    """
    Daily turnover and size factors sampled at month end: turn_Nm,
    bias_turn_Nm, wgt_return_Nm, exp_wgt_return_Nm, ln_capital.

    Reads the pre-saved per-stock file {folder}/{code}.csv (must contain
    datetime/turnover_rate/total_mv) and joins forward-adjusted daily closes.

    :param folder: directory holding the saved base-data CSVs
    :param code: tushare ts_code
    :return: month-end DataFrame keyed by a 'month' ('YYYYMM') column
    """
    dd = pd.read_csv(f'{folder}/{code}.csv')
    dd['datetime'] = dd['datetime'].astype(str)

    dd0 = get_adj_data(code, freq='daily')
    dd0 = dd0[['trade_date', 'close']]
    dd0.columns = ['datetime', 'adj_close']
    dd = pd.merge(dd, dd0, on=['datetime'], how='right')[['datetime', 'adj_close', 'turnover_rate', 'total_mv']]
    dd = dd.dropna()

    for N in [1, 3, 6, 12]:
        window = 21 * N  # ~21 trading days per month
        # Exponential decay weights for the window, oldest bar most decayed.
        # BUG FIX: the original defined a single lambda outside the loop that
        # captured N by late binding — fragile; bind the weights per iteration
        # (and hoist the invariant weight vector out of .apply()).
        weights = np.exp(-np.arange(window)[::-1] / 12 / 4)
        exp_func = lambda x, w=weights: (x.values * w).sum()

        dd[f'turn_{N}m'] = dd['turnover_rate'].rolling(window).mean()
        dd[f'bias_turn_{N}m'] = dd['turnover_rate'].rolling(window).mean() / dd['turnover_rate'].rolling(
            21 * 12 * 2).mean()
        dd[f'wgt_return_{N}m'] = (dd['adj_close'].pct_change() * dd['turnover_rate']).rolling(window).mean()
        dd[f'exp_wgt_return_{N}m'] = (dd['adj_close'].pct_change() * dd['turnover_rate']).rolling(window).apply(
            exp_func)

    dd['ln_capital'] = np.log(dd['total_mv'])
    dd['month'] = dd['datetime'].apply(lambda x: x[:6])
    dd = dd.sort_values(by=['datetime']).reset_index(drop=True)
    dd = dd.drop(['datetime'], axis=1)
    # keep the last trading day of each month
    dd = dd.drop_duplicates(subset=['month'], keep='last')
    return dd


def factors3(code):
    """
    Daily technical factors sampled at month end: ln_price, std_Nm,
    DIF/DEA/MACD, RSI, PSY, BIAS.

    Uses only live daily bars (stock vs 000001.SH index); does not touch
    the saved base-data folder.

    :param code: tushare ts_code
    :return: month-end DataFrame keyed by a 'month' ('YYYYMM') column
    """
    dd_index = pro.index_daily(ts_code='000001.SH')
    dd_index.rename(columns={'trade_date': 'datetime'}, inplace=True)
    dd_index = dd_index.sort_values(by=['datetime']).reset_index(drop=True)
    dd_code = get_adj_data(code=code, freq='daily')
    dd_code.rename(columns={'trade_date': 'datetime'}, inplace=True)
    dd_code['datetime'] = dd_code['datetime'].astype(str)
    dd_index['datetime'] = dd_index['datetime'].astype(str)
    # right-join onto the index calendar; forward-fill stock gaps
    # (suspensions). .ffill() replaces deprecated fillna(method='ffill').
    dd = pd.merge(dd_code[['datetime', 'close']], dd_index[['datetime', 'close']],
                  on=['datetime'], how='right').ffill().dropna()
    dd.columns = ['datetime', 'code', 'index']
    dd = dd.sort_values(by=['datetime']).reset_index(drop=True)
    dd['code_ret'] = dd['code'].pct_change()

    dd['ln_price'] = np.log(dd['code'])

    # rolling N-month (21*N trading-day) volatility of daily returns
    for N in [1, 3, 6, 12]:
        dd[f'std_{N}m'] = dd['code_ret'].rolling(21 * N).std()

    # MACD family with 15/30/10 spans
    dd['DIF'] = dd['code'].ewm(span=15).mean() - dd['code'].ewm(span=30).mean()
    dd['DEA'] = dd['DIF'].ewm(span=10).mean()
    dd['MACD'] = dd['DIF'] - dd['DEA']

    # 20-day RSI built with the recursive SMA smoother
    dd['RSI'] = 100 * pd.Series(SMA(np.maximum(dd['code'] - dd['code'].shift(1), 0), 20, 1)) / pd.Series(
        SMA(np.abs(dd['code'] - dd['code'].shift(1)), 20, 1))
    # share of up-days over the last 20 sessions
    dd['PSY'] = dd['code_ret'].rolling(20).apply(lambda x: len(np.where(x.values > 0)[0]) / 20)
    # deviation from the 20-day mean price
    dd['BIAS'] = (dd['code'] - dd['code'].rolling(20).mean()) / (dd['code'].rolling(20).mean())
    dd['month'] = dd['datetime'].apply(lambda x: x[:6])
    dd = dd.sort_values(by=['datetime']).reset_index(drop=True)
    dd = dd.drop(['datetime', 'code', 'index', 'code_ret'], axis=1)
    # keep the last trading day of each month
    dd = dd.drop_duplicates(subset=['month'], keep='last')
    return dd


def factors4(folder, code):
    """
    Monthly fundamental factors (growth, valuation, leverage, quality)
    built from the saved base-data file {folder}/{code}.csv plus the extra
    fina_indicator fields appended by addfield().

    :param folder: directory holding the saved base-data CSVs
    :param code: tushare ts_code
    :return: month-end DataFrame keyed by a 'month' ('YYYYMM') column
    """
    dd = pd.read_csv(f'{folder}/{code}.csv')
    dd['datetime'] = dd['datetime'].astype(str)
    dd = addfield(dd, code)

    # Factors available directly or via trivial arithmetic
    Sales_G_q = 'or_yoy'
    Profit_G_q = 'netprofit_yoy'
    OCF_G_q = 'cfps_yoy'
    ROE_G_q = 'roe_yoy'

    dd['EP'] = 1 / dd['pe_ttm']
    dd['EPcut'] = dd['profit_dedt'] / dd['total_mv']
    dd['BP'] = 1 / dd['pb']
    # SP = revenue / market cap
    #    = 1 / ((net profit / revenue) * (market cap / net profit))
    #    = 1 / ((profit_to_gr / 100) * pe_ttm) = 100 / (profit_to_gr * pe_ttm)
    # BUG FIX: the original computed the reciprocal, (profit_to_gr*pe_ttm)/100,
    # contradicting its own derivation comment.
    dd['SP'] = 100 / (dd['profit_to_gr'] * dd['pe_ttm'])
    dd['NCFP'] = dd['total_share'] * dd['cfps'] / dd['total_mv']
    dd['OCFP'] = dd['total_share'] * dd['ocfps'] / dd['total_mv']
    dd['DP'] = dd['dv_ttm'] / dd['total_mv']
    dd['G/PE'] = dd['q_profit_yoy'] / dd['pe_ttm']

    # Leverage factors (current_ratio is already present under its own name)
    dd['financial_leverage'] = dd['assets_to_eqt']
    dd['debtequityratio'] = dd['total_ncl'] / (dd['total_assets'] - dd['total_liab'])
    dd['cashratio'] = dd['cash_ratio']

    ROE_q = 'roe'
    ROE_ttm = 'roe_yearly'
    ROA_q = 'roa'
    ROA_ttm = 'roa_yearly'
    grossprofitmargin_q = 'q_gsprofit_margin'
    grossprofitmargin_ttm = 'grossprofit_margin'

    # (deducted net profit / net profit) * (net profit / total revenue)
    #   = deducted net profit / total revenue = dtprofit_to_profit * profit_to_gr
    dd['profitmargin_q'] = dd['dtprofit_to_profit'] * dd['q_profit_to_gr']
    dd['profitmargin_ttm'] = dd['dtprofit_to_profit'] * dd['profit_to_gr']

    dd['assetturnover_ttm'] = dd['assets_turn']  # no quarterly asset-turnover field available
    dd['operationcashflowratio_q'] = dd['ocfps'] / dd['q_npta']
    dd['operationcashflowratio_ttm'] = dd['ocfps'] / dd['npta']

    growth_factors = [Sales_G_q, Profit_G_q, OCF_G_q, ROE_G_q]
    valuation_factors = ['EP', 'EPcut', 'BP', 'SP', 'NCFP', 'OCFP', 'DP', 'G/PE']
    leverage_factors = ['financial_leverage', 'current_ratio', 'debtequityratio', 'cashratio']
    quality_factors = [ROE_q, ROE_ttm, ROA_q, ROA_ttm, grossprofitmargin_q, grossprofitmargin_ttm,
                       'profitmargin_q', 'profitmargin_ttm', 'assetturnover_ttm', 'operationcashflowratio_q',
                       'operationcashflowratio_ttm']

    dd = dd[['datetime'] + growth_factors + valuation_factors + leverage_factors + quality_factors]
    dd['month'] = dd['datetime'].apply(lambda x: x[:6])
    dd = dd.sort_values(by=['datetime'])
    dd = dd.drop(['datetime'], axis=1)
    # keep the last observation of each month
    dd = dd.drop_duplicates(subset=['month'], keep='last')
    return dd


def get_all_factors(folder, code, start_date, end_date):
    """
    Assemble the full monthly factor table for one stock by merging the
    four factor families on 'month' and windowing by date.

    :param folder: directory holding the saved base-data CSVs
    :param code: tushare ts_code
    :param start_date: inclusive lower bound, 'YYYYMMDD' string
    :param end_date: inclusive upper bound, 'YYYYMMDD' string
    :return: DataFrame indexed by datetime; empty if any family came back empty
    """
    frames = [
        # HAlpha, beta, return_Nm — monthly bars only, no financials/daily
        factors1(code),
        # turn_Nm, bias_turn_Nm, wgt_return_Nm, exp_wgt_return_Nm + size
        factors2(folder, code),
        # ln_price, std_Nm, DIF, DEA, MACD, RSI, PSY, BIAS — daily bars only
        factors3(code),
        # fundamental factors built from the saved base data
        factors4(folder, code),
    ]

    if any(len(frame) == 0 for frame in frames):
        return pd.DataFrame()

    merged = frames[0]
    for frame in frames[1:]:
        merged = pd.merge(merged, frame, on=['month'])

    merged['datetime'] = merged['datetime'].astype(str)
    in_window = (merged['datetime'] >= start_date) & (merged['datetime'] <= end_date)
    merged = merged[in_window].reset_index(drop=True)
    merged.index = pd.to_datetime(merged['datetime'])
    return merged.drop(['datetime', 'month'], axis=1)


def de_extreme(ddd):
    """
    Median-based winsorization (de-extreme) of a numeric DataFrame.

    Per column, values beyond median ± 5 * MAD (median absolute deviation)
    are clipped to those bounds. NaNs pass through unchanged.

    The input (and output) must be purely numeric — drop any date/time
    columns first and carry them on the index.

    :param ddd: numeric DataFrame
    :return: DataFrame of the same shape with extremes clipped
    """
    med = ddd.median()
    mad = (ddd - med).abs().median()
    upper = med + 5 * mad
    lower = med - 5 * mad
    # clip aligns the per-column bounds along axis=1; this replaces the
    # original boolean-mask arithmetic with the idiomatic equivalent
    return ddd.clip(lower=lower, upper=upper, axis=1)
